November 2018
Built with R version 3.5.1
devtools::install_github("nicolaroberts/hdp", build_vignettes = TRUE)
ggplot2reshape2gridExtratibbleheatmap3prodlimsurvivalRColorBrewerIRdisplaylibrary('ggplot2')
library('reshape2')
library('gridExtra')
library('tibble')
library('heatmap3')
library('prodlim')
library('survival')
library('RColorBrewer')
library('IRdisplay')
source('utils/tools.R') # custom tools function
source('utils/hdp_tools.R') # hdp related functions
theme_set(theme_minimal())
# set jupyer notebook parameters
options(repr.plot.res = 100, # set a medium-definition resolution for the jupyter notebooks plots (DPI)
repr.matrix.max.rows = 200, # set the maximum number of rows displayed
repr.matrix.max.cols = 100) # set the maximum number of columns displayed
# get cytogenetics data
dd_cyto <- impact <- read.table("data/dd_cyto_cut15.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
# get mutation data
dd_mutation <- read.table("data/dd_mutation_hotspot_cut10.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
# merge cytogenetics and mutation data
cat(all(rownames(dd_mutation) == rownames(dd_cyto)), '\n\n') # check if same row ordering
dd_all <- cbind(dd_mutation, dd_cyto)
print_size(dd_all)
head(dd_all, 3)
164 patients don't have any event:
nrow(dd_all[apply(dd_all, 1, function(x) sum(x)) == 0,])
We run HDP using multiple independent posterior sampling chains and a initial number of raw cluster of 15 (initcc = 15).
All the functions used are defined in utils/hdp_tools.R and make use of Nicola Roberts HDP R package (see notebook header).
To follow the HDP terminology, in the rest of the notebook the term component is used for cluster and the term category is used for one gene/cytogenetic feature (ie we have 157 categories).
# initialise hdp
hdp <- initialise_hdp(dd_all)
# run multiple independent posterior sampling chains with different random seeds
number_of_chains <- 3
chain_list <- vector('list', number_of_chains)
for (i in 1:number_of_chains) {
seed <- i * 100
print_and_flush(sprintf('### Experiment %d (seed = %d) ###\n', i, seed))
# run single hdp chain
chain_list[[i]] <- activate_and_run_hdp(hdp,
initcc = 15,
burnin = 1000,
n = 300,
space = 20,
seed = seed)
print_and_flush('\n')
}
multi_output <- hdp_multi_chain(chain_list)
multi_output
# assess quality of posterior sampling chain
for (experiment in chains(multi_output))
plot_posterior_sampling_chain_quality(experiment, 15, 5)
# extract and plot components
multi_output <- extract_components(multi_output)
plot_components_size(multi_output)
WARNING: component 0 is a "garbage" component, not a real component.
plot_category_distribution_by_component(multi_output, colnames(dd_all))
# get top 7 categories for each component
get_top_categories_by_component(multi_output, colnames(dd_all), top_number = 7)
Let's get the probability prediction for each patient and each component. Notice that the 164 patients with no mutation/cytogenetics are NA rows.
dd_predicted <- get_prediction_result_dataframe(multi_output, dd_all)
head(dd_predicted)
n_components <- ncol(dd_predicted) - 2 - 1 # without columns 'component_0', 'predicted_component' and 'max_proba'
n_components
Component size in the dataset:
get_table(dd_predicted[,'predicted_component'])
plot_continous_feature_by_components <- function(data, feature_name, scale_y) {
set_notebook_plot_size(25, 7)
myLabeller <- function(x) {
x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
return (x)
}
n_initial <- nrow(data)
data <- data[!is.na(data[feature_name]),]
p1 <- ggplot(data) + geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) + theme(plot.title = element_text(size = 18, face = "bold"))
p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
ggtitle(' \n ') + theme(plot.title = element_text(size = 18, face = "bold"))
p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
ggtitle(' \n ') + theme(plot.title = element_text(size = 18, face = "bold"))
if (scale_y) {
p1 <- p1 + scale_y_sqrt()
p2 <- p2 + scale_y_sqrt()
}
grid.arrange(p1, p2, p3, ncol = 3)
}
#suppressWarnings(plot_continous_feature_by_components(dd_clinical, 'HB', TRUE))
source('utils/hdp_tools.R')
plot_assignement_probability_by_component(dd_predicted)
plot_assignement_probability_distribution_by_component(dd_predicted, 25, 10)
# merge patient information dataframe with predicted probability and component
dd_clustered <- cbind(dd_all, dd_predicted)
dd_clustered <- rownames_to_column(dd_clustered, var = 'patient_key')
# sort by decreseasing component size and increasing assignemnt probability
categories_table <- get_table(dd_clustered[,'predicted_component'], add_total_count = FALSE)
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component, levels = categories_table$values)
dd_clustered <- dd_clustered[order(dd_clustered$predicted_component, dd_clustered$max_proba),]
print_size(dd_clustered)
head(dd_clustered, 3)
# compute column separators position
col_sep <- c(cumsum(categories_table$count))
col_sep <- col_sep[-length(col_sep)] # remove last separator
# compute row separators position
row_sep <- c(58, 6, 15, 8, 11, 7, 2, 50)
row_sep_cum <- c(cumsum(row_sep))
row_sep_cum <- row_sep_cum[-length(row_sep_cum)]
# compute row colors vector
sep_color <- rev(c(rgb(180/255, 142/255, 84/255),
rgb(146/255, 187/255, 105/255),
rgb(233/255, 72/255 , 157/255),
rgb(72/255 , 169/255, 137/255),
rgb(226/255, 146/255, 102/255),
rgb(140/255, 137/255, 190/255),
rgb(238/255, 198/255, 116/255),
rgb(0/255, 0/255, 0/255)))
row_colors <- c()
for (i in 1:length(row_sep))
row_colors <- c(row_colors, rep(sep_color[i], row_sep[i]))
# manual gene grouping
col_order <- c(
'ARID1A', 'ARID2', 'BCORL1', 'BRCC3', 'CALR', 'CDKN1B', 'CDKN2A', 'CEBPA', 'CREBBP', 'CSF1R', 'CSF3R', 'CSNK1A1', 'DDX23', 'DDX4', 'DDX41',
'DDX54', 'DHX33', 'DICER1', 'DNMT3B', 'EED', 'EGFR', 'ETNK1', 'FAM175A', 'HIPK2', 'JAK2', 'JARID2', 'KDM5C', 'KMT2C', 'KMT2D', 'LUC7L2',
'MGA', 'MPL', 'NFE2', 'NIPBL', 'NOTCH2', 'NPM1', 'PAPD5', 'PHIP', 'PIK3CA', 'PRPF40B', 'PTPRF', 'RAD50', 'RASGRF1', 'RB1', 'ROBO1', 'ROBO2',
'RRAS', 'SETD2', 'SMG1', 'SPRED2', 'SRCAP', 'STAT3', 'SUZ12', 'TERT', 'YLPM1', 'ZBTB33', 'ZMYM3', 'ZNF318',
'ASXL1', 'BCOR', 'EZH2', 'RAD21', 'STAG2', 'ASXL2',
'ATRX', 'CUX1', 'GNAS', 'IRF1', 'KDM6A', 'KMT2A', 'NOTCH1', 'PHF6', 'SETBP1', 'WT1', 'SH2B3', 'SMC1A', 'EP300', 'SMC3', 'GNB1',
'BRAF', 'CBL', 'FLT3', 'KIT', 'KRAS', 'NF1', 'NRAS', 'PTPN11',
'DNMT3A', 'IDH1', 'IDH2_140', 'IDH2_172', 'TET2', 'ETV6', 'GATA1', 'GATA2', 'RUNX1', 'CTCF', 'MYC',
'SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'U2AF2', 'ZRSR2', 'PRPF8',
'PPM1D', 'TP53',
'del1p', 'del3', 'del3q', 'del4q', 'del5', 'del5q', 'del7', 'del7q', 'del9q', 'del11p', 'del11q', 'del12', 'del12p', 'del13', 'del13q',
'del3p', 'del14', 'del15', 'del16', 'del16q', 'del17', 'del17p', 'del18', 'del20', 'del20q', 'del21', 'del22', 'delX', 'delY',
'plus1', 'plus1p', 'plus1q', 'plus8', 'plus9', 'plus11', 'plus11q', 'plus13', 'plus14', 'plus15', 'plus17p', 'plus17q', 'plus19', 'plus21', 'plus22',
'ring', 'mar', 'WGA', 'r_3_3', 'r_9_9', 'r_1_7'
)
length(col_order) # check that we have all 157 categories
# set a high-definition resolution for this plot (DPI)
options(repr.plot.res = 200)
set_notebook_plot_size(13, 13)
# plot the heatmap
heatmap3(t(as.matrix(dd_clustered[,col_order])),
# no dendogram, no scaling, small legend
Rowv = NA, Colv = NA, labCol = FALSE, scale = 'none', legendfun = function() showLegend(legend=c('Presence', 'Absence'), col=c('steelblue', 'lightgrey')),
col = c('lightgrey', 'steelblue'),
# set row colors
RowSideColors = row_colors,
RowAxisColors = 1,
# add horizontal and vertical lines (+ 0.5 to be at the exact separation)
add.expr = abline(v = col_sep + 0.5,
h = row_sep_cum + 0.5,
col = 'white', lwd = 1, lty = 'dotted'),
# row label size
cexRow = 0.4)
categories_table
# create a dataframe showing the count by component for each category
categories_repartition <- data.frame(category = colnames(dd_all))
for (i in 1:n_components)
categories_repartition[sprintf('component_%d', i)] <- apply(categories_repartition, 1, function(s) sum(dd_clustered[dd_clustered$predicted_component == i, s['category']]))
# sort categories_repartition by most frequent genes
categories_repartition['total_count'] <- rowSums(categories_repartition[,-1])
categories_repartition <- categories_repartition[order(categories_repartition$total_count, decreasing = TRUE),]
head(categories_repartition)
# plot category distribution colored by component count
# transform (extand) categories_repartition in a long data frame by gathering the components columns in one column
long_categories_repartition <- melt(categories_repartition, id = c('category', 'total_count'))
# sort long_gene_repartition by most frequent genes
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = categories_repartition$category)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]
set_notebook_plot_size(25, 7)
# count plot
ggplot(long_categories_repartition) +
geom_bar(aes(category, value, fill = variable), stat = 'identity') +
tilt_x_label(70) +
scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
# proportion plot
ggplot(long_categories_repartition) +
geom_bar(aes(category, value, fill = variable), stat = 'identity', position = 'fill') +
tilt_x_label(70) +
scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
# plot category distribution by component
# sort long_gene_repartition by heatmap gene order
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = col_order)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]
# plot category distribution by component
set_notebook_plot_size(25, 40)
ggplot(long_categories_repartition) + geom_bar(aes(category, value, fill = variable), stat='identity', alpha = 0.8) + tilt_x_label(70) + facet_wrap(~ variable, ncol = 1, scales = 'free')
# save dd_clustered
write.table(dd_clustered, file = 'data/dd_clustered.tsv', sep = '\t')
From Elsa:


# get clinical data
dd_clinical <- read.table("data/df_clinical_selected.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
# merge with clustering data (that already contains mutation and cytogenetics data)
dd_clinical <- merge(dd_clustered, dd_clinical, by.x = 'patient_key', by.y = 'LEUKID')
print_size(dd_clinical)
head(dd_clinical, 2)
plot_continous_feature_by_components <- function(data, feature_name, scale_y_sqrt) {
# Plot the distribution of a continuous feature across the components, produce boxplot, violin plot and density plots side by side
# → Arguments
# - data : dataframe with one row per patient and one column `predicted_component` as well as one column for the continuous feature to plot
# - feature_name: name of the continuous feature to plot
# - scale_y_sqrt: wheter to plot with a sqrt y scale or not
set_notebook_plot_size(25, 7)
# custom labeller for facet_wrap for density plot
myLabeller <- function(x) {
x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
return (x)
}
# remove NA values
n_initial <- nrow(data)
data <- data[!is.na(data[feature_name]),]
# boxplot
p1 <- ggplot(data) +
geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) +
scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +
theme(plot.title = element_text(size = 18, face = "bold"))
# violin plot
p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) +
scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
# density plot
p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) +
facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
if (scale_y_sqrt) {
p1 <- p1 + scale_y_sqrt()
p2 <- p2 + scale_y_sqrt()
}
# plot the three plots side by side and remove ggplot warnings
suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}
# plot_continous_feature_by_components(dd_clinical, 'HB', TRUE)
# plot the distribution of multiple continuous features accross components
for (feature_name in c('HB', 'PLT', 'ANC', 'WBC', 'MONOCYTES', 'PB_BLAST', 'BM_BLAST', 'RINGED_SIDEROBLASTS')) {
plot_continous_feature_by_components(dd_clinical, feature_name, TRUE)
}
plot_continous_feature_by_components(dd_clinical, 'AGE_AT_SAMPLE_TIME', FALSE)
plot_categorical_feature_by_components <- function(data, feature_name) {
# Plot the distribution of a categorical feature across the components
# → Arguments
# - data : dataframe with one row per patient and one column `predicted_component` as well as one column for the categorical feature to plot
# - feature_name: name of the categorical feature to plot
set_notebook_plot_size(25, 5)
# remove NA values
n_initial <- nrow(data)
data <- data[!is.na(data[feature_name]),]
# categorical feature distribution for each component
p1 <- ggplot(data) +
geom_bar(aes_string('predicted_component', fill = feature_name), position = 'fill') +
ggtitle(sprintf('%s count and proportion by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +
theme(plot.title = element_text(size = 18, face = "bold"))
# categorical feature distribution by component
p2 <- ggplot(data) +
geom_bar(aes_string(feature_name, fill = 'predicted_component'), position = 'fill') +
scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
# categorical feature count by component
p3 <- ggplot(data) +
geom_bar(aes_string(feature_name, fill = 'predicted_component'), position = 'dodge') +
scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(12)) +
ggtitle(' \n ') +
theme(plot.title = element_text(size = 18, face = "bold"))
# plot the three plots side by side and remove ggplot warnings
suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}
# plot_categorical_feature_by_components(dd_clinical, 'AML')
# plot the distribution of multiple categorical features accross components
for (feature_name in c('AML', 'WHO_2008_SIMPLIFY')) {
suppressWarnings(plot_categorical_feature_by_components(dd_clinical, feature_name))
}
Work in progress...
set_notebook_plot_size(10, 10)
par(mar = c(3.0, 1.0, 1.0, 1.0))
plot(prodlim(Hist(os_diag_years, os_status) ~ predicted_component, data = dd_clinical[dd_clinical$predicted_component %in% 1:9,], conf.int = FALSE), , col = brewer.pal(10, 'Spectral'))
plot(prodlim(Hist(os_diag_years, os_status) ~ predicted_component, data = dd_clinical[dd_clinical$predicted_component %in% c(NaN),], conf.int = FALSE), , col = 'grey', add = TRUE)
print_p_value_from_survival_curve <- function(data, column_name, value_1_name, value_2_name) {
# work in progress...
stmp <- survdiff(as.formula(sprintf('Surv(os_diag_years, os_status) ~ %s', column_name)), data = data[data[,column_name] %in% c(value_1_name, value_2_name),])
pval <- 1 - pchisq(stmp$chisq, length(stmp$n) - 1)
print_and_flush(sprintf('%-21s and %21s: p = %.3f\n', value_1_name, value_2_name, pval))
}
plot_survival_curve <- function(dd_clinical, category_name, component) {
# work in progress...
display_markdown(sprintf('**%s on components %s**', category_name, paste(component, collapse = ', ')))
extra_col_name <- sprintf('%s_component', category_name)
dd_clinical[extra_col_name] <- 'wildtype'
dd_clinical[dd_clinical[category_name] == 1, extra_col_name] <- sprintf('%s_other', category_name)
for (i in component)
dd_clinical[dd_clinical[category_name] == 1 & dd_clinical$predicted_component == i, extra_col_name] <- sprintf('%s_component_%d', category_name, i)
print_and_flush('## count table\n')
print(get_table(dd_clinical[extra_col_name], add_total_count = FALSE), row.names = FALSE)
print_and_flush('\n')
print_and_flush('## p-values\n')
categories <- as.character(get_table(dd_clinical[extra_col_name], add_total_count = FALSE)$values)
pairs <- combn(categories, 2)
for (i in seq(1, length(pairs), 2))
print_p_value_from_survival_curve(dd_clinical, extra_col_name, pairs[i], pairs[i + 1])
print_and_flush('\n')
set_notebook_plot_size(7, 7)
par(mar = c(3.0, 1.0, 1.0, 1.0))
sr <- prodlim(as.formula(sprintf('Hist(os_diag_years, os_status) ~ %s', extra_col_name)), data = dd_clinical)
plot(sr, confint = FALSE, col = brewer.pal(length(categories), 'Spectral'), legend.title = '', legend.x = 'topright')
Sys.sleep(0.2)
}
plot_survival_curve(dd_clinical, 'SF3B1', c(4, 6))
plot_survival_curve(dd_clinical, 'SRSF2', c(1, 3, 5))
plot_survival_curve(dd_clinical, 'U2AF1_157', c(5, 7))
plot_survival_curve(dd_clinical, 'U2AF1_34', c(8))
plot_survival_curve(dd_clinical, 'ZRSR2', c(1, 7))
plot_survival_curve(dd_clinical, 'del5q', c(2, 6))
plot_survival_curve(dd_clinical, 'TP53', c(2, 6))